import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble
import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb
from sklearn import metrics
from sklearn import model_selection
import pvlib
import pv_clf
import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from IPython.display import Image
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)
%matplotlib notebook
Only making ground predictions, using the PVLib clear-sky model and the statistical model. The NSRDB model won't be available for ground measurements.
# Load the pickled NSRDB frame and express its (tz-aware) index in MST.
# NOTE: read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source.
nsrdb = pd.read_pickle('abq_nsrdb_1.pkl.gz').tz_convert('MST')
# Experiment 1: train on rows before the split date, test on rows from it onward.
split_date = '01-01-2014'
feature_cols = ['GHI', 'Clearsky GHI pvlib']

train = nsrdb[nsrdb.index < split_date]
test = nsrdb[nsrdb.index >= split_date]

# Each feature row is (timestamp, GHI, Clearsky GHI pvlib); the label is sky_status.
X_train = np.asarray(
    [train.index.values] + [train[col].values for col in feature_cols]
).T
X_test = np.asarray(
    [test.index.values] + [test[col].values for col in feature_cols]
).T
y_train = train['sky_status'].values
y_test = test['sky_status'].values

clf = pv_clf.RandomForestClassifierPV()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Plot measured GHI, the clearsky curve scaled by the fitted alpha_scale, and
# the rows selected by y_pred (presumably a boolean mask -- confirm).
measured_ghi = test['GHI']
scaled_clearsky = test['Clearsky GHI pvlib'] * clf.alpha_scale
selected_ghi = test[y_pred]['GHI']
vis = visualize.Visualizer()
vis.add_line_ser(measured_ghi)
vis.add_line_ser(scaled_clearsky)
vis.add_circle_ser(selected_ghi)
vis.show()

# Accuracy on the held-out rows, then class proportions (predicted, actual).
metrics.accuracy_score(y_test, y_pred)
(
    np.bincount(y_pred) / len(y_pred),
    np.bincount(y_test) / len(y_test),
)
# Experiment 2: same pipeline with an earlier split date, so the training set
# is smaller and the test set is larger.
split_date = '01-01-2013'
feature_cols = ['GHI', 'Clearsky GHI pvlib']

train = nsrdb[nsrdb.index < split_date]
test = nsrdb[nsrdb.index >= split_date]

# Each feature row is (timestamp, GHI, Clearsky GHI pvlib); the label is sky_status.
X_train = np.asarray(
    [train.index.values] + [train[col].values for col in feature_cols]
).T
X_test = np.asarray(
    [test.index.values] + [test[col].values for col in feature_cols]
).T
y_train = train['sky_status'].values
y_test = test['sky_status'].values

clf = pv_clf.RandomForestClassifierPV()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Plot measured GHI, the clearsky curve scaled by the fitted alpha_scale, and
# the rows selected by y_pred (presumably a boolean mask -- confirm).
measured_ghi = test['GHI']
scaled_clearsky = test['Clearsky GHI pvlib'] * clf.alpha_scale
selected_ghi = test[y_pred]['GHI']
vis = visualize.Visualizer()
vis.add_line_ser(measured_ghi)
vis.add_line_ser(scaled_clearsky)
vis.add_circle_ser(selected_ghi)
vis.show()

# Accuracy on the held-out rows, then class proportions (predicted, actual).
metrics.accuracy_score(y_test, y_pred)
(
    np.bincount(y_pred) / len(y_pred),
    np.bincount(y_test) / len(y_test),
)
# Time-ordered cross-validation over the current training set (X_train/y_train).
n_splits = 12

# TimeSeriesSplit is not imported by name in this file; reach it through the
# already-imported sklearn.model_selection module to avoid a NameError.
tscv = model_selection.TimeSeriesSplit(n_splits=n_splits)

# Rough rows-per-split figure.
# NOTE(review): sklearn's default test fold size is
# n_samples // (n_splits + 1), so this is only an approximation -- confirm intent.
len(X_train) / n_splits

scores = []
# idx1 holds the training indices and idx2 the held-out indices of each fold.
for idx1, idx2 in tscv.split(X_train):
    # A fresh classifier per fold; note this rebinds the module-level `clf`.
    clf = pv_clf.RandomForestClassifierPV()
    clf.fit(X_train[idx1], y_train[idx1])
    pred = clf.predict(X_train[idx2])
    scores.append(metrics.accuracy_score(y_train[idx2], pred))
    # Class proportions predicted on this fold.
    print(np.bincount(pred) / len(pred))

# Mean and spread of the per-fold accuracies.
np.mean(scores), np.std(scores)